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Abstract 

We give an implementation of a statistical model that can be successfully applied to 
compressing a sequence of binary digits whose behavior is close to random. 

1 Introduction 

It is well known that in most cases Huffman encoding is not the optimal encoding of a stream of 
characters of an alphabet associated with a probabilistic model $p_i$. Although a Huffman-encoded 
sequence of data can look "random", most likely it is not real noise like the kind one can get from 
/dev/random in some UNIX(-like) systems. In the present article we give a "proof of concept" 
implementation of a statistical model capable of detecting the deviation of the data being encoded 
from the notion of "real noise". 

2 The Model 

We break the input data into consecutive blocks $B_1, B_2, B_3, \ldots$ of size $N$ bits each. Each block 
$B_i$ is encoded in the following way: 

1. Count the number of 1's, $k_i$, in $B_i$. 

2. Let $c_1 = k_i$, $c_0 = N - k_i$. The probability to encounter a 0 in the block is $c_0/N$, and for 1 it is 
$c_1/N$. 

3. Encode a bit of the block with the calculated probabilities. Depending on its value we 
decrease either $c_0$ or $c_1$. 

4. Proceed with all the bits of the block. Note: obviously the last bit won't need to be encoded, 
because we know its value with probability 1. Anyway, a decently implemented arithmetic 
encoder should not encode anything when given probability 1. 
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The sequence ki is also a data stream, which needs to be encoded for obvious reasons. The most 
reasonable scheme for its encoding is 0-order adaptive statistical model + arithmetic encoding. 
For "real noise" $k_i$ is supposed to have a binomial distribution: 

$$P(k) = \frac{N!}{2^{N}\, k!\,(N-k)!} \qquad (1)$$



and "slightly compressible data" ("not-so-real noise") could be expected to have some deviation 
from this distribution. We make a plausible conjecture that data whose $k_i$ deviates from (1) 
is compressible. An example of such a deviation is shown in the figure. 




Figure 1: The distribution of $k_i$ for the file xscreensaver-4.00.tar.gz compared to the binomial 
distribution 



3 Source Code 

We provide the C++ source code for the implementation of the algorithm described above. The 
remaining source modules (implementation of arithmetic encoding, front end, error handling, etc.) 
necessary to compile a working program are not included, but are easy to implement. 

#include "bitclasses .h" 
/*#undef TRACE_*/ 



/*
 * Return the total length of stream f, as reported by ftell() after seeking
 * to the end.  The read/write position that f had on entry is restored
 * before returning.  Returns -1 if the position cannot be queried or the
 * seek to the end fails.
 */
long filelen(FILE *f)
{
    long len = -1L;
    const long saved = ftell(f);

    if (saved < 0)
        return -1L;
    if (fseek(f, 0L, SEEK_END) == 0)
        len = ftell(f);
    /* Put the stream back where the caller left it. */
    fseek(f, saved, SEEK_SET);
    return len;
}

/*
 * Factorial of k, computed as a long double product k*(k-1)*...*2.
 * Any k <= 1 (including negative values) yields 1.
 */
static long double facto(int k)
{
    long double res = 1;

    while (k > 1)
        res *= k--;
    return res;
}

/*
 * Reference ("real noise") model for the per-block count of 1-bits:
 * the binomial distribution  P(k) = n! / (2^n * k! * (n-k)!)  for a block
 * of n bits.
 */
class chistomodel {
public:
    int n;      /* block size in bits */
    int *tab;   /* table pointer; nothing in this class allocates it */

    /*
     * Probability that exactly k of the n bits are 1.
     * Returns 0 for k outside [0, n].
     *
     * The value is accumulated as a running product
     *   prod_{i=1..k} (n-k+i)/i  *  prod_{i=1..n} 1/2
     * so no full factorial is ever formed; the intermediate value stays
     * small even for block sizes whose factorial would not fit the
     * floating-point range.
     */
    float pro(int k)
    {
        if (k < 0 || k > n)
            return 0.0f;
        /* C(n,k) == C(n,n-k): use the shorter product. */
        if (k > n - k)
            k = n - k;

        long double p = 1.0L;
        for (int i = 1; i <= n; i++) {
            p *= 0.5L;
            if (i <= k)
                p *= static_cast<long double>(n - k + i) / i;
        }
        return static_cast<float>(p);
    }

    /* NOTE(review): scalar delete is kept from the original; it must match
       however tab ends up being allocated - confirm before allocating an
       array here. */
    ~chistomodel() {
        if (tab)
            delete tab;
    }

    chistomodel() { n = 0; tab = NULL; }
    chistomodel(int n)
    { tab = NULL; init(n); }

    /* Set the block size used by pro(). */
    void init(int n) {
        this->n = n;
    }
};

/* Block size in bytes.  Both branches of the conditional select the same value. */
#if 1
#define B_SIZE 32
#else
#define B_SIZE 32
#endif

/* Number of bits in one block. */
#define BITSINBLOCK (B_SIZE*8)
class ccmodel{ 
public : 

int tab [BITSINBLOCK+2] ; 

unsigned short scale () {return tab [BITSINBL0CK+1] ; } 
ccmodel () {init () ; } 
void initO 
{ 

int i ; 

f or (i=0 ; i<=BITSINBLOCK+l ; i++) 
tab[i]=i; 

} 

void update (int c) 
{ 

int i ; 

if (tab [BITSINBL0CK+1] >=MAXSCALE) 

rescaleO ; 
f or (i=c+l ; i<=BITSINBLDCK+l ; i++) 
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tab[i]++; 

} 

void rescaleO 
{ 

unsigned weights [BITSINBLOCK+1] ; 
int i ; 

f or (i=0 ; i<BITSINBLOCK+l ; i++) 
{ 

weights [i] = (tab [i+1] -tab [i] ) »1 ; 
if ( ! weights [i] ) 
weights [i] =1 ; 

} 

f or (i=0 ; i<BITSINBLOCK+l ; i++) 
tab [i+1] =tab [i] +weights [i] ; 

> 

void chartosymb(int c,csymbol*s) 
{ 

s->low_count=tab [c] ; 
s->high_count=tab[c+l] ; 
s->scale=tab [BITSINBLOCK+1] ; 
} 

int counttochar (short int count , csymbol*s) 
{ 

int l,h,m; 
1=0; 

h=BITSINBLOCK+l; 
while (h-l>l) 
{ 

m=(h+l)»l; 

if (tab [m] <=count) 

l=m; 
else 

h=m; 

} 

s->low_count=tab [1] ; 
s->high_count=tab [1+1] ; 
s->scale=tab [BITSINBLOCK+1] ; 
return 1; 
} 

>; 

/* Per-byte lookup table defined at the end of this file; encode_block() indexes
   it with a block byte and sums the results into its 1-bit count. */
extern int bittable [] ; 
/* Buffer holding the bytes of the block currently being encoded or decoded. */
unsigned char block [B_SIZE] ; 
/* Number of bytes of `block` in use for the current block. */
int br; 

/* Model used to code each block's 1-bit count. */
ccmodel countmodel; 

/*
 * Decode one block of `br` bytes from `ari` into the global `block` buffer.
 *
 * First the block's number of 1-bits is decoded with `countmodel` (which is
 * then updated with it).  Each bit is then decoded, most significant bit of
 * each byte first, using the remaining counts of 0s and 1s as its
 * frequencies; the count of whichever value was decoded is decremented.
 */
void decode_block(carithmeticdecoder &ari)
{
    int s, count;
    int bit1cnt, bit0cnt;
    unsigned mask;
    csymbol symb;

    /* Decode the per-block count of 1-bits. */
    count = ari.get_current_count(countmodel.scale());
    bit1cnt = countmodel.counttochar(count, &symb);
    ari.remove_symbol_from_stream(symb.low_count,
                                  symb.high_count,
                                  symb.scale);
    bit0cnt = br*8 - bit1cnt;
    countmodel.update(bit1cnt);

    for (s = 0; s < br; s++)
    {
        block[s] = 0;
        for (mask = 0x80; mask; mask >>= 1)
        {
            /* Range [0, bit0cnt) codes a 0, [bit0cnt, bit0cnt+bit1cnt) a 1. */
            count = ari.get_current_count(bit0cnt + bit1cnt);
            if (count < bit0cnt)
            {
                ari.remove_symbol_from_stream(0,
                                              bit0cnt,
                                              bit0cnt + bit1cnt);
                bit0cnt--;
            }
            else
            {
                block[s] |= mask;
                ari.remove_symbol_from_stream(bit0cnt,
                                              bit0cnt + bit1cnt,
                                              bit0cnt + bit1cnt);
                bit1cnt--;
            }
        }
    }
}

/*
 * Encode the first `br` bytes of the global `block` buffer through `ari`.
 *
 * The number of 1-bits in the block is counted with `bittable`, printed to
 * stdout (one decimal number per line), and encoded with `countmodel`, which
 * is then updated.  Each bit is then encoded, most significant bit of each
 * byte first, using the remaining counts of 0s and 1s as its frequencies;
 * the count of whichever value was encoded is decremented.
 */
void encode_block(carithmeticencoder &ari)
{
    int i;
    int bit1cnt = 0, bit0cnt;
    int mask;
    csymbol symb;

    for (i = 0; i < br; i++)
        bit1cnt += bittable[block[i]];

    bit0cnt = br*8 - bit1cnt;
#if 1
#if 1
    fprintf(stdout, "%d\n", bit1cnt);
#else
    fputc(bit1cnt, stdout);
#endif
#endif
    /* NOTE(review): the condition of this #if is assumed to be 0 (disabled
       diagnostic) - confirm against the original listing. */
#if 0
    if (bit1cnt > 255)
        fprintf(stderr, "$%d\n", bit1cnt);
#endif

    /* Encode the per-block count of 1-bits. */
    countmodel.chartosymb(bit1cnt, &symb);
    ari.encode_symbol(symb.low_count,
                      symb.high_count,
                      symb.scale);
    countmodel.update(bit1cnt);

    for (i = 0; i < br; i++)
    {
        for (mask = 0x80; mask; mask >>= 1)
            if (block[i] & mask)
            {
                /* A 1 owns the range [bit0cnt, bit0cnt+bit1cnt). */
                symb.low_count = bit0cnt;
                symb.high_count = bit0cnt + bit1cnt;
                symb.scale = bit0cnt + bit1cnt;
                ari.encode_symbol(symb.low_count,
                                  symb.high_count,
                                  symb.scale);
                bit1cnt--;
            }
            else
            {
                /* A 0 owns the range [0, bit0cnt). */
                symb.low_count = 0;
                symb.high_count = bit0cnt;
                symb.scale = bit0cnt + bit1cnt;
                ari.encode_symbol(symb.low_count,
                                  symb.high_count,
                                  symb.scale);
                bit0cnt--;
            }
    }
}

void do_compress() 
{ 

long length, 1; 

coutbitstream outstr; 

carithmeticencoder ari; 

l=length=f ilelen(inf ile) ; 

fwrite(&length,sizeof (length) ,l,outf ile) ; 

outstr. init(outf ile) ; 

ari . init (feoutstr) ; 

while (1) 

{ 

br=f read (block, l,B_SIZE,inf ile) ; 
l-=br; 

encode_block(ari) ; 

if ((br<B_SIZE) I I (1<D) 

break; 
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} 

ari . f lushO ; 
outstr . f lushO ; 

} 

void do_decompress () 
{ 

long length, 1; 
cinbitstream instr; 
carithmeticdecoder ari; 
csymbol symb ; 

f read(&length, sizeof (length) , 1 , inf ile) ; 
l=length; 

instr . init (inf ile) ; 
ari . init (feinstr) ; 
while (1>0) 
{ 

br=KB_SIZE?l:B_SIZE; 
l-=br; 

decode_block(ari) ; 

f write (block, 1 ,br , out file) ; 

> 

} 

/* bittable[v] is the number of 1-bits in the byte value v (0..255). */
int bittable[]={
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,
4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,
4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,
5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,
4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,
4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,
5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
};
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